In this notebook, we will train a single-layer network on a very easy task. The goal is to show how a Dense layer works.
We will use synthesized data: pure sinusoids at 12 different pitches. We compute a CQT of each and feed a single CQT frame into the network.
%matplotlib inline
import numpy as np
import librosa
import keras
from future.utils import implements_iterator # for python 2 compatibility for __next__()
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings('ignore')
plt.rc('figure', titlesize=20)
plt.rc('font', size=20)
plt.rc('xtick', labelsize=12)
Using TensorFlow backend.
def sin_wave(secs, freq, sr, gain):
'''
Generates a sine wave of frequency given by freq, with duration of secs.
'''
t = np.arange(sr * secs)
return gain * np.sin(2 * np.pi * freq * t / sr)
def whitenoise(gain, shape):
'''
Generates uniform white noise with the given shape, scaled by gain.
'''
return gain * np.random.uniform(-1., 1., shape)
class DataGen:
def __init__(self, sr=16000, batch_size=128):
np.random.seed(1209)
self.pitches = [440., 466.2, 493.8, 523.3, 554.4, 587.3,
622.3, 659.3, 698.5, 740., 784.0, 830.6]
self.sr = sr
self.n_class = len(self.pitches) # 12 pitches
self.secs = 1.
self.batch_size = batch_size
self.sins = []
self.labels = np.eye(self.n_class)[range(0, self.n_class)] # 1-hot-vectors
for freq in self.pitches:
cqt = librosa.cqt(sin_wave(self.secs, freq, self.sr, gain=0.5), sr=sr,
fmin=220, n_bins=36, filter_scale=2)[:, 1] # use only one frame!
cqt = librosa.amplitude_to_db(cqt, ref=np.min)
cqt = cqt / np.max(cqt)
self.sins.append(cqt)
self.cqt_shape = cqt.shape # (36, )
def __next__(self):
choice = np.random.choice(12, size=self.batch_size, # pick pitches for this batch
replace=True)
noise_gain = 0.1 * np.random.random_sample(1) # a random noise gain
noise = whitenoise(noise_gain, self.cqt_shape) # generate white noise
xs = [noise + self.sins[i] for i in choice] # compose a batch with additive noise
ys = [self.labels[i] for i in choice] # corresponding labels
return np.array(xs, dtype=np.float32), np.array(ys, dtype=np.float32)
next = __next__
datagen = DataGen()
print("Input: A frame of CQT in a shape of: {}".format(datagen.cqt_shape))
x, y = next(datagen)
print("Input batch: CQT frames, {}".format(x.shape))
print("Number of classes (pitches): {}".format(datagen.n_class))
plt.figure(figsize=(20, 6))
for i in range(2):
x, y = next(datagen)
plt.subplot(2, 2, i+1)
plt.imshow(x.transpose(), cmap=plt.get_cmap('Blues'))
plt.xlabel('data sample index')
plt.ylabel('CQT bin index')
plt.title('Batch {} (x, input)'.format(i+1))
plt.subplot(2, 2, i+3)
plt.imshow(y.transpose(), cmap=plt.get_cmap('Blues'))
plt.title('Batch {} (y, label)'.format(i+1))
print('')
Input: A frame of CQT in a shape of: (36,)
Input batch: CQT frames, (128, 36)
Number of classes (pitches): 12
Each subplot visualises one batch. The pitches range only over [440 Hz, 830.6 Hz] (A4 to G#5), but the CQT covers a wider frequency range: 36 bins spanning 3 octaves upward from 220 Hz.
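As a quick sanity check (a sketch; it assumes librosa's default of 12 bins per octave, which the CQT call above relies on), we can list the centre frequencies of the 36 CQT bins and find the bin closest to each training pitch:
cqt_freqs = librosa.cqt_frequencies(n_bins=36, fmin=220.)  # centre frequencies of the 36 bins
print(cqt_freqs[:3], '...', cqt_freqs[-3:])  # 220 Hz up to ~1661 Hz
for p in datagen.pitches:
    print('{:6.1f} Hz -> bin {}'.format(p, int(np.argmin(np.abs(cqt_freqs - p)))))  # nearest bin (0-based)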
val_datagen = DataGen() # this is a generator for validation set
The model is very simple: a single dense layer with no bias, connecting the 36-dim input to the 12-dim output.
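Concretely, since there is no bias, the layer is just a matrix product followed by a softmax: $\hat{\mathbf{y}} = \mathrm{softmax}(\mathbf{x}\mathbf{W})$, where $\mathbf{x} \in \mathbb{R}^{1 \times 36}$ is one CQT frame and $\mathbf{W} \in \mathbb{R}^{36 \times 12}$ is the weight matrix we will visualise later.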
model = keras.models.Sequential()
model.add(keras.layers.Dense(datagen.n_class, use_bias=False,
input_shape=datagen.cqt_shape)) # A dense layer (36 input nodes --> 12 output nodes)
model.add(keras.layers.Activation('softmax')) # Softmax because it's single-label classification
model.compile(optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.9, # a pretty standard optimizer
decay=1e-6, nesterov=True),
loss='categorical_crossentropy', # categorical crossentropy makes sense with Softmax
metrics=['accuracy']) # we'll also measure the performance but it's NOT a loss function
The number of parameters is $432 = 36 \times 12$. With a bias, it would be $36 \times 12 + 12 = 444$.
model.summary() # Let's see the network.
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
dense_1 (Dense)              (None, 12)                432
_________________________________________________________________
activation_1 (Activation)    (None, 12)                0
=================================================================
Total params: 432
Trainable params: 432
Non-trainable params: 0
_________________________________________________________________
Alright, let's train it!
history = model.fit_generator(datagen, steps_per_epoch=200, epochs=25, verbose=1,
validation_data=val_datagen, validation_steps=4)
Epoch 1/25 200/200 [==============================] - 0s - loss: 2.1811 - acc: 0.5381 - val_loss: 1.8601 - val_acc: 0.9258
Epoch 2/25 200/200 [==============================] - 0s - loss: 1.6234 - acc: 0.9729 - val_loss: 1.4143 - val_acc: 1.0000
Epoch 3/25 200/200 [==============================] - 0s - loss: 1.2387 - acc: 1.0000 - val_loss: 1.0857 - val_acc: 1.0000
Epoch 4/25 200/200 [==============================] - 0s - loss: 0.9700 - acc: 1.0000 - val_loss: 0.8587 - val_acc: 1.0000
Epoch 5/25 200/200 [==============================] - 0s - loss: 0.7810 - acc: 1.0000 - val_loss: 0.7107 - val_acc: 1.0000
Epoch 6/25 200/200 [==============================] - 0s - loss: 0.6424 - acc: 1.0000 - val_loss: 0.5827 - val_acc: 1.0000
Epoch 7/25 200/200 [==============================] - 0s - loss: 0.5405 - acc: 1.0000 - val_loss: 0.5087 - val_acc: 1.0000
Epoch 8/25 200/200 [==============================] - 0s - loss: 0.4618 - acc: 1.0000 - val_loss: 0.4241 - val_acc: 1.0000
Epoch 9/25 200/200 [==============================] - 0s - loss: 0.4014 - acc: 1.0000 - val_loss: 0.3822 - val_acc: 1.0000
Epoch 10/25 200/200 [==============================] - 0s - loss: 0.3546 - acc: 1.0000 - val_loss: 0.3289 - val_acc: 1.0000
Epoch 11/25 200/200 [==============================] - 0s - loss: 0.3149 - acc: 1.0000 - val_loss: 0.3019 - val_acc: 1.0000
Epoch 12/25 200/200 [==============================] - 0s - loss: 0.2843 - acc: 1.0000 - val_loss: 0.2739 - val_acc: 1.0000
Epoch 13/25 200/200 [==============================] - 0s - loss: 0.2566 - acc: 1.0000 - val_loss: 0.2361 - val_acc: 1.0000
Epoch 14/25 200/200 [==============================] - 0s - loss: 0.2353 - acc: 1.0000 - val_loss: 0.2244 - val_acc: 1.0000
Epoch 15/25 200/200 [==============================] - 0s - loss: 0.2161 - acc: 1.0000 - val_loss: 0.2038 - val_acc: 1.0000
Epoch 16/25 200/200 [==============================] - 0s - loss: 0.2011 - acc: 1.0000 - val_loss: 0.1935 - val_acc: 1.0000
Epoch 17/25 200/200 [==============================] - 0s - loss: 0.1860 - acc: 1.0000 - val_loss: 0.1778 - val_acc: 1.0000
Epoch 18/25 200/200 [==============================] - 0s - loss: 0.1737 - acc: 1.0000 - val_loss: 0.1700 - val_acc: 1.0000
Epoch 19/25 200/200 [==============================] - 0s - loss: 0.1629 - acc: 1.0000 - val_loss: 0.1587 - val_acc: 1.0000
Epoch 20/25 200/200 [==============================] - 0s - loss: 0.1535 - acc: 1.0000 - val_loss: 0.1450 - val_acc: 1.0000
Epoch 21/25 200/200 [==============================] - 0s - loss: 0.1451 - acc: 1.0000 - val_loss: 0.1409 - val_acc: 1.0000
Epoch 22/25 200/200 [==============================] - 0s - loss: 0.1369 - acc: 1.0000 - val_loss: 0.1318 - val_acc: 1.0000
Epoch 23/25 200/200 [==============================] - 0s - loss: 0.1307 - acc: 1.0000 - val_loss: 0.1275 - val_acc: 1.0000
Epoch 24/25 200/200 [==============================] - 0s - loss: 0.1238 - acc: 1.0000 - val_loss: 0.1163 - val_acc: 1.0000
Epoch 25/25 200/200 [==============================] - 0s - loss: 0.1177 - acc: 1.0000 - val_loss: 0.1202 - val_acc: 1.0000
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['acc'], label='training')
plt.plot(history.history['val_acc'], label='validation', alpha=0.7)
plt.title('Accuracy')
plt.xlabel('epoch')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='training')
plt.plot(history.history['val_loss'], label='validation', alpha=0.7)
plt.title('Loss')
plt.xlabel('epoch')
plt.legend()
<matplotlib.legend.Legend at 0x1146a0190>
The validation loss closely tracks the training loss, i.e., there is no overfitting.
loss = model.evaluate_generator(datagen, steps=10)
print("loss: {}, accuracy: {}".format(loss[0], loss[1]))
loss: 0.113415751606, accuracy: 1.0
weights = model.get_weights()[0] # (36, 12)
print(weights.shape)
# weights = weights / np.sum(np.abs(weights), axis=1, keepdims=True)
(36, 12)
# weights
plt.figure(figsize=(20, 5))
plt.imshow(weights.transpose(), cmap=plt.get_cmap('Blues'))
plt.colorbar()
plt.title('Visualisation of the trained weights ($W$)')
pitch_names = 'A4 A#4 B4 C5 C#5 D5 D#5 E5 F5 F#5 G5 G#5'.split(' ')
plt.yticks(range(0, 12), ['{} ({})'.format(p, str(i)) for p, i in zip(pitch_names, range(1, 13))])
plt.xticks(range(0, 36), [str(i) for i in range(1, 37)])
plt.xlabel('Frequency bands of CQT')
plt.ylabel('output')
# an example input
plt.figure(figsize=(20, 1))
plt.imshow(x[0:1], cmap=plt.get_cmap('Blues'))
plt.xticks(range(0, 36), [str(i) for i in range(1, 37)])
plt.colorbar()
plt.title('What would be the output if this CQT frame is multiplied by these weights?')
plt.yticks([])
print('')
This is a visualisation of the weights $\textbf{W}$.
Each row corresponds to one output node. For example, the 1st row (A4) is connected to the 1st output node, which will be activated (have a large value) if the network thinks the input is A4.
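As a rough check (a sketch; the expected bin index assumes 12 bins per octave starting at 220 Hz, with 0-based bins), we can compare the CQT bin that each row weights most strongly with the bin where that pitch actually lies:
expected_bins = [int(round(12 * np.log2(f / 220.))) for f in datagen.pitches]  # bin of each pitch
strongest_bins = np.argmax(weights, axis=0)  # for each output node, the bin with the largest weight
for name, exp_bin, max_bin in zip(pitch_names, expected_bins, strongest_bins):
    print('{:4s}: pitch lies in bin {:2d}, strongest weight at bin {:2d}'.format(name, exp_bin, int(max_bin)))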
def softmax(x):
"""A softmax function that is not perfect in terms of numerical stability (it might overflow)"""
return np.exp(x) / np.exp(x).sum(axis=0)
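# Note: as the docstring says, np.exp(x) can overflow for large inputs.
# A common remedy (just a sketch; not needed for this tiny example) is to subtract
# max(x) before exponentiating, which leaves the result mathematically unchanged:
def stable_softmax(x):
    """Numerically stable softmax: shift by the maximum before exponentiating."""
    z = x - np.max(x, axis=0, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=0, keepdims=True)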
output_for_x = softmax(np.dot(weights.transpose(), x[0]))
plt.bar(range(len(output_for_x)), output_for_x)
plt.xticks(range(len(output_for_x)), pitch_names)
plt.title('Output node values for x[0]: ')
print('Estimated pitch: {}'.format(pitch_names[np.argmax(output_for_x)]))
print('Groundtruth: {}'.format(pitch_names[np.argmax(y[0])]))
Estimated pitch: F5 Groundtruth: F5
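To double-check that the manual computation above agrees with Keras (a sketch using model.predict), we can compare the predicted classes over the whole batch:
manual_pred = np.argmax(softmax(np.dot(weights.transpose(), x.transpose())), axis=0)  # manual forward pass
keras_pred = np.argmax(model.predict(x), axis=1)  # Keras' own prediction
print('Agreement with model.predict: {}/{}'.format(np.sum(manual_pred == keras_pred), len(x)))
print('Batch accuracy: {:.3f}'.format(np.mean(keras_pred == np.argmax(y, axis=1))))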